This project demonstrates the process of exploratory data analysis (EDA) using R functions and visualization tools. It is an individual effort project aimed at understanding and analyzing a dataset sourced from public repositories. The project involves data cleaning, exploration, visualization, and drawing insights from the data.
# Load required libraries
library(ggplot2)
library(dplyr)
library(tidyverse)
library(forcats)
library(plotly)
# set the the plot theme
us_data_theme <- theme(legend.text = element_text(size = 6),plot.title = element_text(size = 6),legend.title = element_text(size = 6),axis.title.y = element_text(size = 7),axis.title.x = element_text(size = 7),axis.text.y = element_text(size = 6),axis.text.x = element_text(size = 6),)
us_data_df <- read.csv("us_data_2000.csv")
# inspect the data
glimpse(us_data_df)
## Rows: 2,000
## Columns: 57
## $ MINE_ID <int> 100003, 100003, 100008, 100011, 100011, 100011, 10…
## $ CONTROLLER_ID <chr> "41044", "41044", "M31753", "M11763", "M11763", "M…
## $ CONTROLLER_NAME <chr> "Lhoist Group", "Lhoist Group", "Alan B Cheney", …
## $ OPERATOR_ID <chr> "L13586", "L13586", "L31753", "L17074", "L17074", …
## $ OPERATOR_NAME <chr> "Lhoist North America ", "Lhoist North America ", …
## $ CONTRACTOR_ID <chr> "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ DOCUMENT_NO <dbl> 2.2e+11, 2.2e+11, 2.2e+11, 2.2e+11, 2.2e+11, 2.2e+…
## $ SUBUNIT_CD <int> 3, 30, 30, 30, 30, 30, 30, 30, 3, 3, 30, 30, 30, 3…
## $ SUBUNIT <chr> "STRIP, QUARY, OPEN PIT", "MILL OPERATION/PREPARAT…
## $ ACCIDENT_DT <int> 40982, 39090, 39998, 36672, 38374, 38805, 39729, 4…
## $ CAL_YR <int> 2012, 2007, 2009, 2000, 2005, 2006, 2008, 2012, 20…
## $ CAL_QTR <int> 1, 1, 3, 2, 1, 1, 4, 2, 3, 1, 1, 4, 3, 3, 2, 1, 3,…
## $ FISCAL_YR <int> 2012, 2007, 2009, 2000, 2005, 2006, 2009, 2012, 20…
## $ FISCAL_QTR <int> 2, 2, 4, 3, 2, 2, 1, 3, 4, 2, 2, 1, 4, 4, 3, 2, 4,…
## $ ACCIDENT_TIME <int> 945, 1105, 1000, 1100, 1430, 1130, 430, 930, 730, …
## $ DEGREE_INJURY_CD <chr> "5", "6", "3", "5", "5", "5", "3", "5", "5", "4", …
## $ DEGREE_INJURY <chr> "DAYS RESTRICTED ACTIVITY ONLY", "NO DYS AWY FRM W…
## $ FIPS_STATE_CD <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ UG_LOCATION_CD <chr> "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", …
## $ UG_LOCATION <chr> "NO VALUE FOUND", "NO VALUE FOUND", "NO VALUE FOUN…
## $ UG_MINING_METHOD_CD <chr> "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", …
## $ UG_MINING_METHOD <chr> "NO VALUE FOUND", "NO VALUE FOUND", "NO VALUE FOUN…
## $ MINING_EQUIP_CD <chr> "24", "28", "?", "?", "?", "?", "28", "24", "?", "…
## $ MINING_EQUIP <chr> "Front-end loader, Tractor-shovel, Payloader, High…
## $ EQUIP_MFR_CD <chr> "119", "121", "?", "?", "?", "?", "121", "119", "?…
## $ EQUIP_MFR_NAME <chr> "Not on this list", "Not Reported", "NO VALUE FOUN…
## $ EQUIP_MODEL_NO <chr> "22321", "", "", "?", "", "", "", "S160", "?", "",…
## $ SHIFT_BEGIN_TIME <int> 600, 700, 600, 700, 700, 700, 2300, 700, 700, 1800…
## $ CLASSIFICATION_CD <chr> "12", "10", "18", "9", "9", "21", "10", "18", "9",…
## $ CLASSIFICATION <chr> "POWERED HAULAGE", "HANDTOOLS (NONPOWERED)", "SLIP…
## $ ACCIDENT_TYPE_CD <chr> "21", "8", "30", "27", "38", "26", "29", "30", "30…
## $ ACCIDENT_TYPE <chr> "CGHT I, U, B, MVNG & STTN OBJS", "STRUCK BY, NEC"…
## $ NO_INJURIES <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ TOT_EXPER <dbl> 4.35, 0.02, 10.00, NA, 0.87, 5.62, 1.23, 21.54, NA…
## $ MINE_EXPER <dbl> 4.35, 0.02, 2.15, 0.23, 0.87, 5.62, 1.23, 21.54, N…
## $ JOB_EXPER <dbl> 0.67, 0.02, 2.15, 0.23, 0.38, 5.62, 1.23, 21.54, N…
## $ OCCUPATION_CD <chr> "374", "374", "374", "374", "382", "304", "374", "…
## $ OCCUPATION <chr> "Warehouseman, Bagger, Palletizer/Stacker, Store k…
## $ ACTIVITY_CD <chr> "28", "30", "13", "28", "96", "39", "30", "23", "2…
## $ ACTIVITY <chr> "HANDLING SUPPLIES/MATERIALS", "HAND TOOLS (NOT PO…
## $ INJURY_SOURCE_CD <chr> "76", "46", "117", "4", "21", "2", "46", "76", "3"…
## $ INJURY_SOURCE <chr> "SURFACE MINING MACHINES", "AXE,HAMMER,SLEDGE", "G…
## $ NATURE_INJURY_CD <chr> "160", "180", "330", "330", "130", "330", "330", "…
## $ NATURE_INJURY <chr> "CONTUSN,BRUISE,INTAC SKIN", "CUT,LACER,PUNCT-OPN …
## $ INJ_BODY_PART_CD <chr> "700", "100", "520", "420", "330", "420", "450", "…
## $ INJ_BODY_PART <chr> "MULTIPLE PARTS (MORE THAN ONE MAJOR)", "HEAD,NEC"…
## $ SCHEDULE_CHARGE <int> 0, 0, 0, NA, 0, 0, 0, 0, NA, 0, 0, NA, NA, 0, 0, 0…
## $ DAYS_RESTRICT <int> 8, 0, 0, 5, 5, 3, 0, 21, 10, 19, 0, NA, NA, NA, 0,…
## $ DAYS_LOST <int> 0, 0, 9, NA, 0, 0, 0, 0, NA, 13, 1, NA, NA, NA, 0,…
## $ TRANS_TERM <chr> "N", "N", "N", "N", "N", "N", "", "N", "N", "N", "…
## $ RETURN_TO_WORK_DT <chr> "03/26/2012", "39326", "07/14/2009", "36531", "383…
## $ IMMED_NOTIFY_CD <chr> "? ", "? ", "? ", "13", "? ", "? ", "? ", "? ", "1…
## $ IMMED_NOTIFY <chr> "NO VALUE FOUND", "NO VALUE FOUND", "NO VALUE FOUN…
## $ INVEST_BEGIN_DT <chr> "", "", "", "", "01/22/2005", "", "", "", "", "", …
## $ NARRATIVE <chr> "Employee was cleaning up at the Primary Crusher w…
## $ CLOSED_DOC_NO <dbl> NA, NA, 3.2e+11, 3.2e+11, NA, 3.2e+11, NA, 3.2e+11…
## $ COAL_METAL_IND <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", …
From the above summary we can see that there are 57 variables and 2000 observations.Now we will check the blank/NA values for all the variables.
apply(is.na(us_data_df), 2, sum)
## MINE_ID CONTROLLER_ID CONTROLLER_NAME OPERATOR_ID
## 0 0 0 0
## OPERATOR_NAME CONTRACTOR_ID DOCUMENT_NO SUBUNIT_CD
## 0 0 0 0
## SUBUNIT ACCIDENT_DT CAL_YR CAL_QTR
## 0 0 0 0
## FISCAL_YR FISCAL_QTR ACCIDENT_TIME DEGREE_INJURY_CD
## 0 0 0 0
## DEGREE_INJURY FIPS_STATE_CD UG_LOCATION_CD UG_LOCATION
## 0 0 0 0
## UG_MINING_METHOD_CD UG_MINING_METHOD MINING_EQUIP_CD MINING_EQUIP
## 0 0 0 0
## EQUIP_MFR_CD EQUIP_MFR_NAME EQUIP_MODEL_NO SHIFT_BEGIN_TIME
## 0 0 1 9
## CLASSIFICATION_CD CLASSIFICATION ACCIDENT_TYPE_CD ACCIDENT_TYPE
## 0 0 0 0
## NO_INJURIES TOT_EXPER MINE_EXPER JOB_EXPER
## 0 360 333 328
## OCCUPATION_CD OCCUPATION ACTIVITY_CD ACTIVITY
## 0 0 0 0
## INJURY_SOURCE_CD INJURY_SOURCE NATURE_INJURY_CD NATURE_INJURY
## 0 0 0 0
## INJ_BODY_PART_CD INJ_BODY_PART SCHEDULE_CHARGE DAYS_RESTRICT
## 0 0 673 543
## DAYS_LOST TRANS_TERM RETURN_TO_WORK_DT IMMED_NOTIFY_CD
## 422 0 0 0
## IMMED_NOTIFY INVEST_BEGIN_DT NARRATIVE CLOSED_DOC_NO
## 0 0 0 1138
## COAL_METAL_IND
## 0
From the above result we can see that some of the variables have large number of blank/NA values, we will clean this data further.We will ignore the not required columns for our EDA process. Now we will check the individual column and change the corresponding values for the column as per the values described in the given data dictionary file,this will help us to understand the meaning of data correctly. For e.g. COAL_METAL_IND is the variable which has the two categories “M” and “c” which represents the values coal mine and Metal Non Metal mine.we will rename the values for “M and”C” as per the given data dictionary.
us_data_df$COAL_METAL_IND[us_data_df$COAL_METAL_IND == 'C'] <- 'Coal Mine'
us_data_df$COAL_METAL_IND[us_data_df$COAL_METAL_IND == 'M'] <- 'Metal Non Metal Mine'
us_data_df %>%
pull(COAL_METAL_IND) %>%
unique()
## [1] "Metal Non Metal Mine" "Coal Mine"
# Analyze the total no. of injuries for the mine categories for the Fiscal year
# Line plot with multiple groups
# Grouping data and calculating total injuries
df_grp_mine_injuries <- us_data_df %>%
group_by(FISCAL_YR, COAL_METAL_IND) %>%
summarise(Total_injuries = sum(NO_INJURIES), .groups = 'drop')
# Plotting
p <- ggplot(data = df_grp_mine_injuries, aes(x = FISCAL_YR, y = Total_injuries, color = COAL_METAL_IND, group = COAL_METAL_IND, text = paste("Fiscal Year:", FISCAL_YR, "<br> Mine Category:", COAL_METAL_IND, "<br> Total Injuries:", Total_injuries))) +
geom_line() +
geom_point() +
ggtitle("Total Injuries by Mine Category Over Fiscal Years") +
xlab("Fiscal Year") +
ylab("Total Injuries") +
scale_color_manual(values = c("#2E86C1", "#f68060")) + # Customizing line colors
us_data_theme +
theme(
plot.title = element_text(size = 12, hjust = 0.5, vjust = 1.5),
axis.text = element_text(size = 8),
axis.title = element_text(size = 10)
)
# Convert ggplot to plotly
p <- ggplotly(p, tooltip = "text")
p
Total Injuries by Mine Category Over Fiscal Years (Line Plot):
# Analyze the total injuries based on the location
# bar plot
# Grouping data and calculating total injuries
df_grp_mine_injuries <- us_data_df %>%
group_by(INJ_BODY_PART) %>%
summarise(Total_injuries = sum(NO_INJURIES), .groups = 'drop')
# Reordering factor levels by total injuries
df <- df_grp_mine_injuries %>%
mutate(INJ_BODY_PART = fct_reorder(INJ_BODY_PART, Total_injuries))
# Plotting
p <- ggplot(df, aes(x = INJ_BODY_PART, y = Total_injuries, text = paste("Body Part: ", INJ_BODY_PART, "<br>Total Injuries: ", Total_injuries))) +
geom_bar(stat = "identity", fill = "#2E86C1", alpha = 0.9, width = 0.4) +
coord_flip() +
xlab("Injured Body Part") +
ylab("Total Injuries") +
ggtitle("Count of Injuries by Body Part") +
us_data_theme +
theme(
plot.title = element_text(size = 12, hjust = 0.5, vjust = 1.5),
axis.text = element_text(size = 8),
axis.title = element_text(size = 10)
)
# Convert ggplot to plotly
p <- ggplotly(p, tooltip = "text")
p
Count of Injuries by Body Part (Bar Plot with Tooltip):
# Replace 'NO VALUE FOUND' with 'unknown' in UG_MINING_METHOD
us_data_df$UG_MINING_METHOD[us_data_df$UG_MINING_METHOD == 'NO VALUE FOUND'] <- 'unknown'
# Grouping data and calculating total injuries
df_grp_mine_injuries <- us_data_df %>%
group_by(FISCAL_YR, UG_MINING_METHOD) %>%
summarise(Total_injuries = sum(NO_INJURIES), .groups = 'drop')
# Arrange data by fiscal year
df_grp_mine_injuries <- df_grp_mine_injuries %>%
arrange(FISCAL_YR)
# Plotting
p <- ggplot(df_grp_mine_injuries, aes(x = FISCAL_YR, y = Total_injuries, fill = UG_MINING_METHOD, text = paste("Mining Method: ", UG_MINING_METHOD, "<br> Fiscal Year: ", FISCAL_YR, "<br> Total Injuries: ", Total_injuries))) +
geom_col(position = "stack") +
ggtitle("Stacked Area Plot for Underground Mining Methods") +
xlab("Fiscal Year") +
ylab("Total Injuries") +
us_data_theme +
theme(
plot.title = element_text(size = 10, hjust = 0.5, vjust = 4),
legend.key.height = unit(0.3, 'cm'),
legend.key.width = unit(0.4, 'cm')
)
# Convert ggplot to plotly
p <- ggplotly(p, tooltip = "text")
p
Stacked Area Plot for Underground Mining Methods:
# Grouping data and calculating total injuries
df_grp_mine_injuries <- us_data_df %>%
group_by(CAL_QTR, COAL_METAL_IND) %>%
summarise(Total_injuries = sum(NO_INJURIES), .groups = 'drop')
# Plotting
p <- ggplot(df_grp_mine_injuries, aes(x = CAL_QTR, y = Total_injuries, fill = COAL_METAL_IND, text = paste("Mine Type: ", COAL_METAL_IND, "<br>Calendar Quarter: ", CAL_QTR, "<br>Total Injuries: ", Total_injuries))) +
geom_bar(stat = "identity", position = "dodge") +
ggtitle("Total Injuries by Mine Type per Calendar Quarter") +
xlab("Calendar Quarter") +
ylab("Total Injuries") +
us_data_theme +
theme(plot.title = element_text(size = 10, hjust = 0.5, vjust = 4))
# Convert ggplot to plotly
p <- ggplotly(p, tooltip = "text")
p
Total Injuries by Mine Type per Calendar Quarter (Grouped Bar Plot with Tooltip):
# Grouping data and calculating the count of injured employees
df_trans_term <- us_data_df %>%
group_by(FISCAL_YR, TRANS_TERM) %>%
summarise(count = n(), .groups = 'drop')
# Mapping TRANS_TERM values to labels
df_trans_term$TRANS_TERM <- ifelse(df_trans_term$TRANS_TERM == "Y", "Terminated/Transferred",
ifelse(df_trans_term$TRANS_TERM == "N", "Not Terminated/Transferred", "Unknown"))
# Plotting as a scatter plot with tooltips
p <- ggplot(data = df_trans_term, aes(x = FISCAL_YR, y = count, color = TRANS_TERM, text = paste("Status: ", TRANS_TERM, "<br>Fiscal Year: ", FISCAL_YR, "<br>Count: ", count))) +
geom_point() + # Points and color by group
ggtitle("Analyse the status of the injured employee per Fiscal year") +
us_data_theme +
theme(plot.title = element_text(size = 10, hjust = 0.5, vjust = 4))
# Convert ggplot to plotly with tooltips
p <- ggplotly(p, tooltip = "text")
p
Analyzing Injured Employee Status per Fiscal Year: